Read in the data

This is a classical dataset about Breast cancer from Kaggle. https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data In some pervious reasearch, some machine learning methods are used to do the binary classification. The dataset here contains 10 dimensions clincal features

library(readr)
library(ggplot2)
library(dplyr)
## Warning: 程辑包'dplyr'是用R版本4.2.3 来建造的
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble  3.2.1     ✔ stringr 1.5.0
## ✔ tidyr   1.2.1     ✔ forcats 1.0.0
## ✔ purrr   1.0.0
## Warning: 程辑包'tibble'是用R版本4.2.3 来建造的
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## Warning: 程辑包'tidymodels'是用R版本4.2.3 来建造的
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.3     ✔ rsample      1.1.1
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.4     ✔ yardstick    1.1.0
## ✔ recipes      1.0.5
## Warning: 程辑包'dials'是用R版本4.2.3 来建造的
## Warning: 程辑包'infer'是用R版本4.2.3 来建造的
## Warning: 程辑包'modeldata'是用R版本4.2.3 来建造的
## Warning: 程辑包'parsnip'是用R版本4.2.3 来建造的
## Warning: 程辑包'rsample'是用R版本4.2.3 来建造的
## Warning: 程辑包'tune'是用R版本4.2.3 来建造的
## Warning: 程辑包'workflows'是用R版本4.2.3 来建造的
## Warning: 程辑包'workflowsets'是用R版本4.2.3 来建造的
## Warning: 程辑包'yardstick'是用R版本4.2.3 来建造的
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(patchwork)
library(embed)
## Warning: 程辑包'embed'是用R版本4.2.3 来建造的
url <- "https://raw.githubusercontent.com/900Step/Stat-436/main/cancer.csv"

cancer <- read_csv(url)
## Rows: 569 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): diagnosis
## dbl (11): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# cancer
pca_result <- prcomp(cancer[,3:12], center = TRUE, scale. = TRUE)
# pc_weights
# summary(pc_weights)
pca_summary <- summary(pca_result)

pca_summary 
## Importance of components:
##                           PC1    PC2     PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.3406 1.5870 0.93841 0.7064 0.61036 0.35234 0.28299
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241 0.00801
## Cumulative Proportion  0.5479 0.7997 0.88779 0.9377 0.97495 0.98736 0.99537
##                            PC8     PC9    PC10
## Standard deviation     0.18679 0.10552 0.01680
## Proportion of Variance 0.00349 0.00111 0.00003
## Cumulative Proportion  0.99886 0.99997 1.00000
# prop_var <- pca_summary$sdev^2 / sum(pca_summary$sdev^2)
# prop_var
# sum(prop_var[1:5])/sum(prop_var)
diagnosis = cancer[,c(1,2)]



pca_recipe <- recipe(~., data = cancer) %>%
  # update_role(id, starts_with("group"), new_role = "id") %>%
  update_role(id, diagnosis, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## predictor: 10
## id:         2
## 
## ── Operations
## • Centering and scaling for: all_predictors()
## • PCA extraction with: all_predictors()
pca_prep <- prep(pca_recipe)
pca_result <- tidy(pca_prep, 2)
pca_result %>% filter(component %in% str_c("PC", 1:5))
## # A tibble: 50 × 4
##    terms                    value component id       
##    <chr>                    <dbl> <chr>     <chr>    
##  1 radius_mean            -0.364  PC1       pca_Kj431
##  2 texture_mean           -0.154  PC1       pca_Kj431
##  3 perimeter_mean         -0.376  PC1       pca_Kj431
##  4 area_mean              -0.364  PC1       pca_Kj431
##  5 smoothness_mean        -0.232  PC1       pca_Kj431
##  6 compactness_mean       -0.364  PC1       pca_Kj431
##  7 concavity_mean         -0.396  PC1       pca_Kj431
##  8 concave_points_mean    -0.418  PC1       pca_Kj431
##  9 symmetry_mean          -0.215  PC1       pca_Kj431
## 10 fractal_dimension_mean -0.0718 PC1       pca_Kj431
## # … with 40 more rows
# pca_result %>% filter(colnames(pca_result$rotation) %in% str_c("PC", 1:5))

# pca_result

ggplot(pca_result %>% filter(component %in% str_c("PC", 1:5))) +
  geom_col(aes(x = value, y = terms)) +
  facet_wrap(~ component) +
  labs(x = "Component", y = "Features")

pca_scores <- bake(pca_prep, cancer)
# pca_scores
ggplot(pca_scores)+
  geom_point(aes(x = PC1, y = PC2, color = diagnosis))

group_order <- pca_scores %>%
  group_by(diagnosis) %>%
  summarise(mpc2 = mean(PC2)) %>%
  arrange(mpc2)

group_order
## # A tibble: 2 × 2
##   diagnosis   mpc2
##   <fct>      <dbl>
## 1 B         -0.171
## 2 M          0.288
# pca_scores %>%
#   mutate(group = factor(diagnosis, levels = group_order)) %>%
#   ggplot(aes(x = PC1, y = PC2)) +
#   geom_vline(xintercept = 0, col = "#4a4a4a") +
#   geom_hline(yintercept = 0, col = "#4a4a4a") +
#   geom_point(size = 0.4, alpha = 0.6) +
#   scale_x_continuous(breaks = seq(-8, 0, length.out = 3)) +
#   scale_color_brewer(palette = "Set2") +
#   facet_wrap(~ reorder(group, PC1), ncol = 9) +
#   coord_fixed() +
#   theme(strip.text = element_text(size = 8))

library(plotly)
## 
## 载入程辑包:'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Create some sample data


# Create a 3D scatter plot
fig <- plot_ly(x = pca_scores$PC1, y = pca_scores$PC2, z = pca_scores$PC3, type = "scatter3d", mode = "markers", color = pca_scores$diagnosis)
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

UMAP

# create a shiny app for the n_neighbor


umap_rec <- recipe(~., data = cancer) %>%
  update_role(id, diagnosis, new_role = "id") %>%
  step_umap(all_predictors(), learn_rate = 0.1, neighbors = 20)
umap_prep <- prep(umap_rec)
umap_prep 
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## predictor: 10
## id:         2
## 
## ── Training information
## Training data contained 569 data points and no incomplete rows.
## 
## ── Operations
## • UMAP embedding for: radius_mean, texture_mean, perimeter_mean, ... | Trained
embeddings <- bake(umap_prep, new_data = cancer) %>%
  left_join(cancer)
## Joining with `by = join_by(id, diagnosis)`
ggplot(embeddings) +
  geom_point(aes(UMAP1, UMAP2, group = diagnosis, col = diagnosis), alpha = 0.4) +
  scale_color_brewer(palette = "Set2")

# embeddings$diagnosis
# , col = as.factor(diagnosis)
library(shiny)
## 
## 载入程辑包:'shiny'
## The following object is masked from 'package:infer':
## 
##     observe
ui<-fluidPage(
  titlePanel("The UMAP plot of the breast cance dataset"),
  numericInput("input_n", "n_neighbors: ", value = 50, min = 5, max = 200),
  plotOutput("umap")
)


server <- function(input, output){
  
  umap_prep <- reactive({
    recipe(~., data = cancer) %>%
    update_role(id, diagnosis, new_role = "id") %>%
    step_umap(all_predictors(), learn_rate = 0.1, neighbors = input$input_n)%>%
    prep()
    
  })
  umap_embeddings <- reactive({
    bake(umap_prep(), new_data = cancer) %>%
    left_join(cancer)
    
  })
  
  output$umap <- renderPlot({
    ggplot(umap_embeddings())+
  geom_point(aes(UMAP1, UMAP2, group = diagnosis, col = diagnosis), alpha = 0.4) +
  scale_color_brewer(palette = "Set2")
  })
}

# shinyApp(ui, server)